Slip 6


Q.1. Write a Python program to implement Polynomial Regression for the
Boston Housing Dataset.

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load the Boston Housing dataset
# fetch_openml retrieves the dataset from OpenML, so this step needs network
# access unless a cached copy is already available locally.
boston = fetch_openml(name='boston', version=1, as_frame=True)
df = boston.frame

print("Dataset Loaded Successfully!\n")
print("First 5 Rows:\n", df.head(), "\n")
print("Columns:\n", df.columns, "\n")

# Step 2: Define features (X) and target (y)
# OpenML declares CHAS and RAD as nominal attributes, so with as_frame=True
# they come back as pandas 'category' columns rather than numbers. Cast
# everything to float explicitly instead of relying on scikit-learn's silent
# object-to-float coercion; this also fails loudly if a column is ever
# genuinely non-numeric.
X = df.drop("MEDV", axis=1).astype(float)  # MEDV = Median value of owner-occupied homes
y = df["MEDV"].astype(float)

# Step 3: Hold out 20% of the rows for testing (fixed seed => reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Expand the features to degree 2 (no constant/bias column).
# The expansion is fitted on the training rows only and then applied to both
# partitions.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train)
X_train_poly, X_test_poly = (poly.transform(part) for part in (X_train, X_test))

print(f"Polynomial features created: {X_train_poly.shape[1]} total features\n")

# Step 5: Standardize the expanded features.
# The scaler learns its statistics from the training rows only; the test rows
# are transformed with those same statistics.
scaler = StandardScaler()
scaler.fit(X_train_poly)
X_train_poly, X_test_poly = (
    scaler.transform(part) for part in (X_train_poly, X_test_poly)
)

# Step 6: Fit a linear regression on the expanded, scaled training features
model = LinearRegression().fit(X_train_poly, y_train)

# Step 7: Predict the held-out test rows
y_pred = model.predict(X_test_poly)

# Step 8: Score the predictions against the true test targets
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print("✅ Model Evaluation:")
print("Mean Squared Error:", round(mse, 2))
print("R² Score:", round(r2, 4))

# Step 9: Show the first few actual/predicted pairs side by side
preview_rows = 5
comparison = pd.DataFrame({
    'Actual': y_test.values[:preview_rows],
    'Predicted': y_pred[:preview_rows],
})
print("\nSample Predictions:\n", comparison)

Q.2. Use a K-means clustering model to classify the employees into various income groups
or clusters. Preprocess the data if required (i.e., drop missing or null values).

# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# 2. Create Sample Employee Dataset
# Seed NumPy's global RNG so the synthetic table is identical on every run.
np.random.seed(42)

n_employees = 50
# The draw order (income, then age, then experience) determines the values
# produced from the seeded generator, so it must not be reordered.
# randint's upper bound is exclusive.
income = np.random.randint(20000, 120000, size=n_employees)  # Annual income
age = np.random.randint(22, 60, size=n_employees)
experience = np.random.randint(1, 35, size=n_employees)

data = pd.DataFrame({
    'EmployeeID': range(1, n_employees + 1),
    'Income': income,
    'Age': age,
    'Experience': experience,
})

print("Sample Employee Dataset:")
print(data.head())

# 3. Data Preprocessing
feature_cols = ['Income', 'Age', 'Experience']  # Features for clustering

# Drop rows that are missing any clustering feature: KMeans cannot be fitted
# on NaN values. For the synthetic dataset this removes nothing, but it keeps
# the script working if `data` is replaced by a real employee file.
# .copy() gives an independent frame for the column assignments made later.
data = data.dropna(subset=feature_cols).copy()
X = data[feature_cols]

# Scale the features so that Income (tens of thousands) does not dominate the
# Euclidean distances over Age and Experience (tens).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. Determine Optimal k
# (a) Elbow Method: collect the inertia (within-cluster sum of squares) of a
#     fitted model for every k from 1 to 10.
K = range(1, 11)
wcss = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_
    for k in K
]

# Plot Elbow
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 5))
elbow_ax.plot(K, wcss, 'bo-')
elbow_ax.set_xlabel('Number of clusters (k)')
elbow_ax.set_ylabel('WCSS (Inertia)')
elbow_ax.set_title('Elbow Method')
plt.show()

# (b) Silhouette Score: starts at k=2 because the score is undefined for a
#     single cluster.
print("\nSilhouette Scores:")
for k in range(2, 11):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    print(f"k={k} -> Silhouette Score = {score:.3f}")

# Choose optimal k (based on Elbow + Silhouette).
# This value is set by hand after inspecting the plot and the scores printed
# above; nothing here selects it automatically.
k_opt = 3

# 5. Apply K-Means Clustering
# Fit the final model with the chosen k; the per-row assignments are read
# from the fitted estimator.
kmeans = KMeans(n_clusters=k_opt, n_init=10, random_state=42)
kmeans.fit(X_scaled)
labels = kmeans.labels_
data['Cluster'] = labels

print("\nDataset with Cluster Labels:")
print(data.head())

# 6. Analyze Clusters
# numeric_only=True restricts the averages to numeric columns.
cluster_means = data.groupby('Cluster').mean(numeric_only=True)
print("\nCluster-wise Mean Values:")
print(cluster_means)

# 7. Visualize clusters in 2D using PCA
# Project the scaled features onto their first two principal components so
# the cluster assignments can be drawn on a flat scatter plot.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
first_component, second_component = X_pca[:, 0], X_pca[:, 1]

cluster_fig, cluster_ax = plt.subplots(figsize=(8, 5))
cluster_ax.scatter(
    first_component,
    second_component,
    c=labels,  # colour each point by its cluster label
    cmap='viridis',
    edgecolor='k',
    s=50,
)
cluster_ax.set_title('K-Means Clustering of Employees (2D PCA)')
cluster_ax.set_xlabel('PCA 1')
cluster_ax.set_ylabel('PCA 2')
plt.show()

# 8. Correctly Label Clusters Based on Income
# Rank the clusters by mean income, lowest first. K-Means cluster ids are
# arbitrary, so the ranking (not the id) decides which label a cluster gets.
cluster_income = data.groupby('Cluster')['Income'].mean().sort_values()

# Build exactly one label per cluster. A fixed three-item list would be
# silently truncated by zip() for fewer clusters (mislabelling the richest
# group) and would leave clusters unmapped (NaN) for more than three.
n_income_groups = len(cluster_income)
if n_income_groups == 3:
    income_labels = ['Low Income', 'Medium Income', 'High Income']
elif n_income_groups == 2:
    income_labels = ['Low Income', 'High Income']
else:
    # Generic ranked names for any other cluster count (1 = lowest income).
    income_labels = [
        f'Income Level {rank} of {n_income_groups}'
        for rank in range(1, n_income_groups + 1)
    ]

cluster_labels = dict(zip(cluster_income.index, income_labels))
data['Income_Group'] = data['Cluster'].map(cluster_labels)

print("\nDataset with Income Group Labels:")
print(data.head())
